In [6]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
This tutorial draws on the pandas website, the Hands-On ML tutorial (https://github.com/ageron/handson-ml), and the tutorial in the pandas documentation.
See: https://pandas.pydata.org/pandas-docs/stable/dsintro.html
In [20]:
# Build a Series from five draws of a standard normal distribution.
# A seeded generator keeps the values identical across Restart & Run All.
rng = np.random.default_rng(42)
v = pd.Series(rng.standard_normal(5))
v
Out[20]:
When no index is given, pandas assigns a default integer index starting at 0.
In [5]:
# Label each letter with an explicit integer (1-5) instead of the default 0-based index.
letters = list("abcde")
index = list(range(1, len(letters) + 1))
v = pd.Series(data=letters, index=index)
v
Out[5]:
In [21]:
# Inspect the Python type of `v`, which was built with pd.Series.
type(v)
Out[21]:
In [22]:
# Plain Python dictionary mapping three string keys to floats.
d = dict(a=0., b=1., c=2.)
d
Out[22]:
In [23]:
# Build a Series straight from the dictionary `d`.
d_v = pd.Series(data=d)
d_v
Out[23]:
Converting a dictionary to a Series uses each key as an index label and each value as the corresponding data point.
In [24]:
d_v['b'] # label-based lookup: the value stored under index label 'b'
Out[24]:
In [25]:
# Positional lookup. `d_v` is indexed by string labels, so a bare integer in
# `[]` only worked through a positional fallback that pandas has deprecated;
# `.iloc` states the intent explicitly.
d_v.iloc[2] # by row position
Out[25]:
In [26]:
# An explicit index selects and orders the entries by label; 'd' is not a key of `d`, so that entry is NaN.
pd.Series(d, index=['b', 'c', 'd', 'a'])
Out[26]:
In [27]:
# A scalar value is repeated for every label in the index.
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])
Out[27]:
In [9]:
# Twelve temperature readings reused by the Series examples that follow.
temperatures = [
    4.4, 5.1, 6.1, 6.2, 6.1, 6.1,
    5.7, 5.2, 4.7, 4.1, 3.9, 3.5,
]
# Attach the name "Temperature" to the Series, then draw it as a line plot.
s7 = pd.Series(data=temperatures, name="Temperature")
s7.plot(kind="line")
plt.show()
In [10]:
# Twelve hourly timestamps starting at 17:30 on 29 Oct 2016.
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated in recent pandas releases.
dates = pd.date_range('2016/10/29 5:30pm', periods=12, freq='h')
dates
Out[10]:
In [11]:
# Pair every temperature reading with its timestamp; `dates` becomes the index.
temp_series = pd.Series(data=temperatures, index=dates)
temp_series
Out[11]:
In [12]:
# Bar chart of the readings, one bar per timestamp, with grid lines switched on.
temp_series.plot.bar()
plt.grid(True)
plt.show()
In [13]:
# The same readings drawn as a line over time.
temp_series.plot.line()
plt.show()
In [115]:
# Display the whole series: the temperature readings indexed by timestamp.
temp_series
Out[115]:
In [116]:
temp_series.shape  # tuple of dimensions; a Series has exactly one
Out[116]:
In [126]:
temp_series.size  # total number of elements
Out[126]:
In [120]:
temp_series.dtype  # data type of the stored values
Out[120]:
In [121]:
temp_series.hasnans # True if any value in the series is NaN
Out[121]:
In [122]:
temp_series.values  # the underlying data as a NumPy array, without the index
Out[122]:
In [130]:
# Add 1 to every reading using an explicit array of ones, one entry per value.
ones = np.ones(len(temp_series))
temp_series + ones
Out[130]:
In [131]:
def square(value):
    """Return ``value`` multiplied by itself."""
    return value * value
In [132]:
temp_series.apply(square) # call square() on every value and return the results as a new Series
Out[132]:
In [136]:
# Keep only the entries whose timestamp falls at exactly 17:30, whatever the date.
temp_series.at_time('17:30')
Out[136]:
In [140]:
# Keep the entries whose time of day lies between 17:30 and 19:30.
temp_series.between_time(start_time='17:30', end_time='19:30')
Out[140]:
In [141]:
# Summary statistics: count, mean, standard deviation, min, quartiles and max.
temp_series.describe()
Out[141]:
In [143]:
# First three entries of the series.
temp_series.head(3)
Out[143]:
In [150]:
# Walk the series as (index label, value) pairs and print each one.
for timestamp, value in temp_series.items():
    print("Time: {}, Value: {}".format(timestamp, value))
In [151]:
# Most frequently occurring value(s) in the series.
temp_series.mode()
Out[151]:
In [152]:
# How many times each distinct value occurs.
temp_series.value_counts()
Out[152]:
In [153]:
# Order the entries by value (ascending by default).
temp_series.sort_values()
Out[153]:
In [154]:
# Order the entries by index label, i.e. by timestamp here.
temp_series.sort_index()
Out[154]:
In [155]:
# Convert to a plain dict mapping each index label to its value.
temp_series.to_dict()
Out[155]:
In [160]:
temp_series.sample(frac=.25, random_state=42) # random 25% of the entries; random_state makes the draw repeatable
Out[160]:
DataFrames are 2-dimensional labeled data structures with columns of potentially different types.
In [15]:
# Each column is supplied as its own Series. pandas aligns them on their index
# labels when building the frame, so the differing label orders below line up
# by name, and a label missing from a column's Series becomes NaN in that column.
people_dict = dict(
    weight=pd.Series([68, 83, 112], index=["alice", "bob", "charles"]),
    birthyear=pd.Series([1984, 1985, 1992], index=["bob", "alice", "charles"], name="year"),
    children=pd.Series([0, 3], index=["charles", "bob"]),
    hobby=pd.Series(["Biking", "Dancing"], index=["alice", "bob"]),
)
people = pd.DataFrame(data=people_dict)
people
Out[15]:
Transpose
In [16]:
# .T swaps rows and columns: the people become columns and the attributes become rows.
people.T
Out[16]:
In [17]:
# The transpose was only displayed, not assigned, so `people` itself is unchanged.
people
Out[17]:
In [161]:
# First three rows of the frame.
people.head(3)
Out[161]:
In [162]:
# Last two rows of the frame.
people.tail(2)
Out[162]:
In [163]:
# Transposed view of the frame (rows and columns swapped).
people.T
Out[163]:
In [164]:
people.shape  # (number of rows, number of columns)
Out[164]:
In [166]:
# Pairwise correlation between the numeric columns. The frame also holds the
# text column "hobby", which recent pandas versions refuse to correlate, so
# restrict the input to numeric dtypes explicitly.
people.select_dtypes(include="number").corr()
Out[166]:
In [171]:
import seaborn as sns
import matplotlib
%matplotlib inline
In [172]:
# Heatmap of the correlation matrix. Only numeric columns are correlated: the
# frame also holds the text column "hobby", which recent pandas versions refuse
# to pass through corr().
sns.heatmap(people.select_dtypes(include="number").corr())
Out[172]:
Referencing a named index
loc is used for label-based indexing
In [24]:
# Select the row labelled 'charles'; the result is a Series indexed by column name.
people.loc['charles']
Out[24]:
Referencing a row number
iloc is used for position-based indexing
In [25]:
# Row at position 2 (0-based, i.e. the third row), returned as a Series.
people.iloc[2]
Out[25]:
In [26]:
# Every row from position 2 onward, returned as a DataFrame.
people.iloc[2:]
Out[26]:
In [42]:
# `people[...]` with a single label looks up a COLUMN, and 'charles' is a row
# label, so this raises KeyError. The error is the point of the cell, so it is
# caught and printed to let the notebook keep running end to end.
try:
    people['charles']
except KeyError as exc:
    print("KeyError:", exc)
Referencing a named column
In [40]:
# A list of column labels returns a DataFrame, here with the single column 'weight'.
people[['weight']]
Out[40]:
In [57]:
# All rows of column 'weight'; a single column label returns a Series.
people.loc[:,'weight']
Out[57]:
Referencing a column number
In [43]:
people.iloc[:,3] # all rows of the column at position 3 (positions are 0-based, so the fourth column)
Out[43]:
In [53]:
# `.ix` was removed in pandas 1.0; `.iloc` is its positional replacement.
# Position 3 lies past the last of the frame's three rows, so the lookup
# raises IndexError. It is caught and printed so the notebook keeps running.
try:
    people.iloc[3]
except IndexError as exc:
    print("IndexError:", exc)
In [67]:
people.iloc[:,3]['alice'] # column at position 3 (0-based), then the entry labelled 'alice'
Out[67]:
In [60]:
# Show the frame again as a reference for the lookups that follow.
people
Out[60]:
In [62]:
people.iloc[1,0] # by position: row 1, column 0 (both 0-based)
Out[62]:
In [63]:
people.loc['bob', 'birthyear'] # by label: row 'bob', column 'birthyear'
Out[63]:
In [68]:
people.iloc[:,0]['bob'] # column by position (0), then row by label ('bob')
Out[68]:
In [70]:
# The row Series is indexed by column names, so a bare integer in `[]` relied
# on a deprecated positional fallback; `.iloc` makes the positional step explicit.
people.loc['bob', :].iloc[0] # row by label ('bob'), then column by position (0)
Out[70]:
In [77]:
people.iloc[1,:]['birthyear'] # row by position (1), then column by label ('birthyear')
Out[77]:
In [79]:
# The column Series is indexed by person names, so a bare integer in `[]`
# relied on a deprecated positional fallback; `.iloc` makes it explicit.
people.loc[:,'birthyear'].iloc[1] # column by label ('birthyear'), then row by position (1)
Out[79]:
In [71]:
# The column Series is indexed by person names, so a bare integer in `[]`
# relied on a deprecated positional fallback; `.iloc` makes it explicit.
people.iloc[:,0].iloc[1] # column by position (0), then row by position (1)
Out[71]:
In [72]:
people.loc['bob', :]['birthyear'] # row by label ('bob'), then column by label ('birthyear')
Out[72]:
In [45]:
people.iloc[1:3] # rows at positions 1 and 2 (the 2nd and 3rd rows); the stop position 3 is excluded
Out[45]:
In [46]:
# Boolean mask: keep only the rows whose birthyear is earlier than 1990.
people[people["birthyear"] < 1990]
Out[46]:
# These four statements were fused onto one line, so everything after the
# first `#` was a comment and `birthyears` was never defined.
people["age"] = 2016 - people["birthyear"]  # adds a new column "age"
people["over 30"] = people["age"] > 30  # adds another column "over 30"
birthyears = people.pop("birthyear")  # removes the column and returns it as a Series
del people["children"]  # drops the column in place
people
In [48]:
# Show the birthyear column that was popped off `people`.
birthyears
Out[48]:
In [84]:
# Column assignment aligns on the row labels: alice has no entry and gets NaN,
# while eugene is not a row of `people` and is ignored.
pets = pd.Series({"bob": 0, "charles": 5, "eugene": 1})
people["pets"] = pets
people
Out[84]:
In [85]:
# insert() raises ValueError when the column already exists, so the guard
# keeps this cell safe to re-run. Position 1 makes "height" the second column.
if "height" not in people.columns:
    people.insert(1, "height", [172, 181, 185])
people
Out[85]:
In [86]:
# Scatter plot of weight against height; `s` sets one marker size per row.
people.plot.scatter(x="height", y="weight", s=[40, 120, 200])
plt.show()
In [ ]:
In [87]:
# assign() returns a new frame with the extra columns; `people` is not modified.
# Height is stored in centimetres, so convert to metres before squaring.
height_m = people["height"] / 100
people.assign(
    body_mass_index=people["weight"] / height_m ** 2,
    has_pets=people["pets"] > 0,
)
Out[87]:
In [106]:
# Look at `people` again: assign() returned a new frame, so the original has no extra columns.
people
Out[106]:
In [54]:
# Print a concise summary: the index, each column's non-null count and dtype, and memory usage.
people.info()
Descriptive Statistics
In [88]:
# include='all' summarises every column, not only the numeric ones.
people.describe(include='all')
Out[88]:
In [92]:
# Smallest value in the "height" column.
people["height"].min()
Out[92]:
In [94]:
# Largest value in the "children" column.
# NOTE(review): requires `people` to still have a "children" column at this point.
people["children"].max()
Out[94]: